library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
numberFamilies <- 10000
famsLoc <- "../RObjects/family_dfs/individualDataFrames/no_censoring"
families <- list()
i=1
for(file in list.files(famsLoc)){
load(str_interp("${famsLoc}/${file}"))
families[[i]]=family
i=i+1
}
#load in the summary table of PanelPro results
load(str_interp("../RObjects/summary_tables/summaryTable10000Families_no_censoring.Rdata"))
probandAge <- c()
for(i in 1:length(families)){
proband = families[[i]] %>% filter(isProband==1)
probandAge <- c(probandAge, proband$CurAge)
}
summaryTable$probandAge <- probandAge
summaryTable<- summaryTable %>% filter(carrierRiskUnaffectedInfoMasked < 1)
summaryTable<- summaryTable %>% filter(fullCarrierRisk < 1)
print(nrow(summaryTable))
## [1] 9646
logit <- function(x){
log(x)-log(1-x)
}
#offset <- 0.00001
summaryTable$logitFullCarrierRisk <- logit(summaryTable$fullCarrierRisk)
summaryTable$logitCarrierRiskUnaffectedInfoMasked <- logit(summaryTable$carrierRiskUnaffectedInfoMasked)
#logit spreads out those lower values and higher values
summaryTable$numAffectedRelativesIndicator0 <- ifelse(summaryTable$numAffectedRelatives == 0, 1, 0)
summaryTable$numAffectedRelativesIndicator1 <- ifelse(summaryTable$numAffectedRelatives == 1, 1, 0)
summaryTable$numAffectedRelativesIndicator2 <- ifelse(summaryTable$numAffectedRelatives == 2, 1, 0)
summaryTable$numAffectedRelativesIndicator3 <- ifelse(summaryTable$numAffectedRelatives >= 3, 1, 0)
summaryTable$group <- ifelse(summaryTable$numAffectedRelativesIndicator0 == 1, "0",
ifelse(summaryTable$numAffectedRelativesIndicator1 == 1, "1",
ifelse(summaryTable$numAffectedRelativesIndicator2 == 1, "2",
ifelse(summaryTable$numAffectedRelativesIndicator3 == 1, "3+","NA"))))
#checking for correlations that may induce collinearity
cor_df <- summaryTable[, c("logitFullCarrierRisk", "logitCarrierRiskUnaffectedInfoMasked", "numAffectedRelativesIndicator1", "numAffectedRelativesIndicator2","numAffectedRelativesIndicator3")]
print(cor(cor_df))
## logitFullCarrierRisk
## logitFullCarrierRisk 1.0000000
## logitCarrierRiskUnaffectedInfoMasked 0.8454426
## numAffectedRelativesIndicator1 -0.2066505
## numAffectedRelativesIndicator2 -0.2272025
## numAffectedRelativesIndicator3 0.3527884
## logitCarrierRiskUnaffectedInfoMasked
## logitFullCarrierRisk 0.8454426
## logitCarrierRiskUnaffectedInfoMasked 1.0000000
## numAffectedRelativesIndicator1 -0.2291213
## numAffectedRelativesIndicator2 -0.2065293
## numAffectedRelativesIndicator3 0.3671385
## numAffectedRelativesIndicator1
## logitFullCarrierRisk -0.2066505
## logitCarrierRiskUnaffectedInfoMasked -0.2291213
## numAffectedRelativesIndicator1 1.0000000
## numAffectedRelativesIndicator2 -0.0827825
## numAffectedRelativesIndicator3 -0.5392998
## numAffectedRelativesIndicator2
## logitFullCarrierRisk -0.2272025
## logitCarrierRiskUnaffectedInfoMasked -0.2065293
## numAffectedRelativesIndicator1 -0.0827825
## numAffectedRelativesIndicator2 1.0000000
## numAffectedRelativesIndicator3 -0.7108671
## numAffectedRelativesIndicator3
## logitFullCarrierRisk 0.3527884
## logitCarrierRiskUnaffectedInfoMasked 0.3671385
## numAffectedRelativesIndicator1 -0.5392998
## numAffectedRelativesIndicator2 -0.7108671
## numAffectedRelativesIndicator3 1.0000000
summaryTableFirstDegAff <- summaryTable %>% filter(firstDegreeAffectedFamilyMembersBinary == 1)
nrow(summaryTableFirstDegAff)
## [1] 3759
#take out the 0 indicator because we are restricting to families with >= 1 affected relative
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + numAffectedRelativesIndicator1 + numAffectedRelativesIndicator2, data=summaryTableFirstDegAff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## numAffectedRelativesIndicator1 + numAffectedRelativesIndicator2,
## data = summaryTableFirstDegAff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4137 -2.3804 -0.3293 3.1184 7.1742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.32378 0.05324 -81.216 < 2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 2.38396 0.02763 86.278 < 2e-16 ***
## numAffectedRelativesIndicator1 -1.42154 0.37640 -3.777 0.000161 ***
## numAffectedRelativesIndicator2 -2.24691 0.23491 -9.565 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.01 on 3755 degrees of freedom
## Multiple R-squared: 0.694, Adjusted R-squared: 0.6937
## F-statistic: 2838 on 3 and 3755 DF, p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point() +
geom_line(aes(y = predict(fitLinear), color = as.factor(group))) +
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "Number of Affected Relatives")
res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)
# get predicted values for training data
expit <- function(x){
exp(x) / (1 + exp(x))
}
summaryTableFirstDegAff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0971821656232819"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0742977280350966"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked , data=summaryTableFirstDegAff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked,
## data = summaryTableFirstDegAff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5343 -2.3744 -0.5196 3.1562 7.4438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.40967 0.05323 -82.84 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 2.45110 0.02707 90.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.049 on 3757 degrees of freedom
## Multiple R-squared: 0.6857, Adjusted R-squared: 0.6856
## F-statistic: 8196 on 1 and 3757 DF, p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point(aes(color = probandMLH1Status)) +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "Proband MLH1")
res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)
# make a qq plot to assess normality
qqnorm(res)
qqline(res)
# get predicted values for training data
summaryTableFirstDegAff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0973747492407199"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.074229639381619"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + numAffectedRelatives , data=summaryTableFirstDegAff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## numAffectedRelatives, data = summaryTableFirstDegAff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.3863 -1.5190 -0.0745 1.4875 6.5016
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.607774 0.086106 -99.97 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 1.678548 0.024586 68.27 <2e-16 ***
## numAffectedRelatives 0.360479 0.006563 54.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.271 on 3756 degrees of freedom
## Multiple R-squared: 0.8257, Adjusted R-squared: 0.8256
## F-statistic: 8896 on 2 and 3756 DF, p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point(aes(color = as.factor(numAffectedRelatives))) +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", subtitle="First Degree Family Members Affected, Covariate: Number of Affected Relatives", color = "Number of Affected Relatives")
# get predicted values for training data
summaryTableFirstDegAff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.128149495702171"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0622038078710044"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge , data=summaryTableFirstDegAff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## probandAge, data = summaryTableFirstDegAff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5343 -2.3744 -0.5196 3.1562 7.4438
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.40967 0.05323 -82.84 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 2.45110 0.02707 90.53 <2e-16 ***
## probandAge NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.049 on 3757 degrees of freedom
## Multiple R-squared: 0.6857, Adjusted R-squared: 0.6856
## F-statistic: 8196 on 1 and 3757 DF, p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point(aes(color =probandAge)) +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Affected, Covariate: Proband Age", color = "Proband Age")
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge + numAffectedRelatives , data=summaryTableFirstDegAff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## probandAge + numAffectedRelatives, data = summaryTableFirstDegAff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.3863 -1.5190 -0.0745 1.4875 6.5016
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.607774 0.086106 -99.97 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 1.678548 0.024586 68.27 <2e-16 ***
## probandAge NA NA NA NA
## numAffectedRelatives 0.360479 0.006563 54.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.271 on 3756 degrees of freedom
## Multiple R-squared: 0.8257, Adjusted R-squared: 0.8256
## F-statistic: 8896 on 2 and 3756 DF, p-value: < 2.2e-16
Proband age again seems statistically insignificant
ggplot(data = summaryTableFirstDegAff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point() +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Affected, Covariates: Proband Age and Number of Affected Relatives", color = "Proband Age")
summaryTableFirstDegUnaff <- summaryTable %>% filter(firstDegreeAffectedFamilyMembersBinary == 0)
nrow(summaryTableFirstDegUnaff)
## [1] 5887
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + numAffectedRelativesIndicator0 + numAffectedRelativesIndicator1 + numAffectedRelativesIndicator2, data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## numAffectedRelativesIndicator0 + numAffectedRelativesIndicator1 +
## numAffectedRelativesIndicator2, data = summaryTableFirstDegUnaff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0761 -1.0164 -0.0547 0.9454 7.2737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.20780 0.04644 -155.208 < 2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 1.59751 0.02027 78.822 < 2e-16 ***
## numAffectedRelativesIndicator0 -0.19762 0.12099 -1.633 0.102
## numAffectedRelativesIndicator1 -0.58210 0.07831 -7.433 1.21e-13 ***
## numAffectedRelativesIndicator2 -0.62772 0.06377 -9.844 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.584 on 5882 degrees of freedom
## Multiple R-squared: 0.5881, Adjusted R-squared: 0.5879
## F-statistic: 2100 on 4 and 5882 DF, p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point() +
geom_line(aes(y = predict(fitLinear), color = as.factor(group))) +
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "Number of Affected Relatives")
res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)
# get predicted values for training data
expit <- function(x){
exp(x) / (1 + exp(x))
}
summaryTableFirstDegUnaff$predicted <- expit(predict(fitLinear))
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.00055016849877575"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00150412237671034"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked,
## data = summaryTableFirstDegUnaff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0394 -1.0562 -0.1008 0.9294 7.2865
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.17585 0.04630 -154.99 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 1.67414 0.01861 89.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.601 on 5885 degrees of freedom
## Multiple R-squared: 0.5791, Adjusted R-squared: 0.579
## F-statistic: 8096 on 1 and 5885 DF, p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point(aes(color=probandMLH1Status)) +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information Linear Regression", subtitle="First Degree Family Members Unaffected", color = "Proband MLH1")
res <- resid(fitLinear)
plot(fitted(fitLinear), res)
abline(0,0)
# make a qq plot to assess normality
qqnorm(res)
qqline(res)
# get predicted values for training data
summaryTableFirstDegUnaff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.000662526614050893"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00148619474565262"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + numAffectedRelatives , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## numAffectedRelatives, data = summaryTableFirstDegUnaff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2152 -0.9842 0.0017 0.9602 5.6139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.625417 0.069115 -124.80 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 1.441611 0.019539 73.78 <2e-16 ***
## numAffectedRelatives 0.196898 0.007277 27.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.51 on 5884 degrees of freedom
## Multiple R-squared: 0.6257, Adjusted R-squared: 0.6255
## F-statistic: 4917 on 2 and 5884 DF, p-value: < 2.2e-16
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point(aes(color = as.factor(numAffectedRelatives))) +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression", subtitle="First Degree Family Members Unaffected, Covariate: Number of Affected Relatives", color = "Number of Affected Relatives")
### Covariate: Proband Age
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## probandAge, data = summaryTableFirstDegUnaff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0394 -1.0562 -0.1008 0.9294 7.2865
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.17585 0.04630 -154.99 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 1.67414 0.01861 89.98 <2e-16 ***
## probandAge NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.601 on 5885 degrees of freedom
## Multiple R-squared: 0.5791, Adjusted R-squared: 0.579
## F-statistic: 8096 on 1 and 5885 DF, p-value: < 2.2e-16
Here the proband age seems more statistically signficant for families that do not have an affected first degree relative
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point(aes(color =probandAge)) +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Unaffected, Covariate: Proband Age", color = "Proband Age")
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked + probandAge + numAffectedRelatives , data=summaryTableFirstDegUnaff)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked +
## probandAge + numAffectedRelatives, data = summaryTableFirstDegUnaff)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.2152 -0.9842 0.0017 0.9602 5.6139
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.625417 0.069115 -124.80 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 1.441611 0.019539 73.78 <2e-16 ***
## probandAge NA NA NA NA
## numAffectedRelatives 0.196898 0.007277 27.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.51 on 5884 degrees of freedom
## Multiple R-squared: 0.6257, Adjusted R-squared: 0.6255
## F-statistic: 4917 on 2 and 5884 DF, p-value: < 2.2e-16
Proband age seems statistically significant here as well
ggplot(data = summaryTableFirstDegUnaff, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point() +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Linear Regression",subtitle="First Degree Family Members Unaffected, Covariates: Proband Age and Number of Affected Relatives", color = "Proband Age")
# get predicted values for training data
summaryTableFirstDegUnaff$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.000751750339751532"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00124419275115061"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"
First we will run a linear regression model on all families generated.
fitLinear <- lm(logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked , data=summaryTable)
print(summary(fitLinear))
##
## Call:
## lm(formula = logitFullCarrierRisk ~ logitCarrierRiskUnaffectedInfoMasked,
## data = summaryTable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5057 -1.5978 -0.2705 1.3052 7.9860
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.18092 0.03501 -148.0 <2e-16 ***
## logitCarrierRiskUnaffectedInfoMasked 2.36741 0.01523 155.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.427 on 9644 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7147
## F-statistic: 2.417e+04 on 1 and 9644 DF, p-value: < 2.2e-16
residuals <- residuals(fitLinear)
qqnorm(residuals)
qqline(residuals)
#the residuals should fit to the line in the qqplot if the data is normally distributed
range(summaryTable$fullCarrierRisk)
## [1] 7.020627e-08 9.999384e-01
range(summaryTable$carrierRiskUnaffectedInfoMasked)
## [1] 0.003292563 0.999862461
range(summaryTable$logitFullCarrierRisk)
## [1] -16.471828 9.695422
range(summaryTable$logitCarrierRiskUnaffectedInfoMasked)
## [1] -5.712791 8.891468
ggplot(data = summaryTable, aes(x= logitCarrierRiskUnaffectedInfoMasked, y= logitFullCarrierRisk)) +
geom_point(aes(color = as.factor(firstDegreeAffectedFamilyMembersBinary))) +
geom_line(aes(y = predict(fitLinear)))+
labs(x = "Carrier Risk Unaffected Info Masked (logit)", y= "Carrier Risk for Full Family Information (logit)", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Linear Regression", color = "1st Deg Aff")
# get predicted values for training data
summaryTable$predicted <- expit(predict(fitLinear)) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0336185884164781"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0362840655982899"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"
fitLoess <- loess(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTable, span = 0.2)
summary(fitLoess) #RSE is 0.00397 (seems good but I'm not sure)
## Call:
## loess(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked,
## data = summaryTable, span = 0.2)
##
## Number of Observations: 9646
## Equivalent Number of Parameters: 15.72
## Residual Standard Error: 0.09717
## Trace of smoother matrix: 17.38 (exact)
##
## Control settings:
## span : 0.2
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
loessVis = ggplot(data = summaryTable, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
geom_point(aes(color = probandMLH1Status)) +
geom_line(aes(y = predict(fitLoess)))+
labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression")
print(loessVis)
# get predicted values for training data
summaryTable$predicted <- predict(fitLoess) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0617798157971618"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0245539261430636"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"
fitLoess <- loess(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegAff, span = 0.2)
summary(fitLoess) #RSE is 0.00397 (seems good but I'm not sure)
## Call:
## loess(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked,
## data = summaryTableFirstDegAff, span = 0.2)
##
## Number of Observations: 3759
## Equivalent Number of Parameters: 15.49
## Residual Standard Error: 0.1202
## Trace of smoother matrix: 17.13 (exact)
##
## Control settings:
## span : 0.2
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
loessVis = ggplot(data = summaryTableFirstDegAff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
geom_point(aes(color = probandMLH1Status)) +
geom_line(aes(y = predict(fitLoess)))+
labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression", subtitle="First Degree Family Members Affected")
print(loessVis)
# get predicted values for training data
summaryTableFirstDegAff$predicted <- predict(fitLoess)
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.162655605163222"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0515684041885317"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"
fitLoess <- loess(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegUnaff, span = 0.2)
summary(fitLoess)
## Call:
## loess(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked,
## data = summaryTableFirstDegUnaff, span = 0.2)
##
## Number of Observations: 5887
## Equivalent Number of Parameters: 16.43
## Residual Standard Error: 0.02714
## Trace of smoother matrix: 18.16 (exact)
##
## Control settings:
## span : 0.2
## degree : 2
## family : gaussian
## surface : interpolate cell = 0.2
## normalize: TRUE
## parametric: FALSE
## drop.square: FALSE
loessVis = ggplot(data = summaryTableFirstDegUnaff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
geom_point(aes(color = probandMLH1Status)) +
geom_line(aes(y = predict(fitLoess)))+
labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression", subtitle="First Degree Family Members Affected")
print(loessVis)
# get predicted values for training data
summaryTableFirstDegUnaff$predicted <- predict(fitLoess)
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.00224151612905406"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00148496323184557"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"
fitMedianPolished <- rlm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTable)
## Warning in rlm.default(x, y, weights, method = method, wt.method = wt.method, :
## 'rlm' failed to converge in 20 steps
print(summary(fitMedianPolished))
##
## Call: rlm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked,
## data = summaryTable)
## Residuals:
## Min 1Q Median 3Q Max
## -0.169798 -0.015675 0.002148 0.013571 0.820718
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -0.0211 0.0003 -62.2679
## carrierRiskUnaffectedInfoMasked 0.2004 0.0011 185.2869
##
## Residual standard error: 0.02109 on 9644 degrees of freedom
ggplot(data = summaryTable, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
geom_point(aes(color = probandMLH1Status)) +
geom_line(aes(y = predict(fitMedianPolished)))+
labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression")
# get predicted values for training data
summaryTable$predicted <- predict(fitMedianPolished) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0223329738900075"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0500296559514577"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"
fitMedianPolished <- rlm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegAff)
print(summary(fitMedianPolished))
##
## Call: rlm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked,
## data = summaryTableFirstDegAff)
## Residuals:
## Min 1Q Median 3Q Max
## -0.83388 -0.07492 -0.01929 0.10017 0.23967
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -0.1876 0.0033 -57.0199
## carrierRiskUnaffectedInfoMasked 1.0692 0.0075 141.9900
##
## Residual standard error: 0.1227 on 3757 degrees of freedom
ggplot(data = summaryTableFirstDegAff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
geom_point(aes(color = probandMLH1Status)) +
geom_line(aes(y = predict(fitMedianPolished)))+
labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "Comparing Carrier Risk for Masked Info and Full Family Information with Loess Regression", subtitle="First Degree Family Members Affected")
# get predicted values for training data
summaryTableFirstDegAff$predicted <- predict(fitMedianPolished) #need to do the expit to undo the logit transformation
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegAff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegAff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.176700240328606"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.340750072857346"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.165735567970205"
mse_adj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegAff$probandMLH1Status) - summaryTableFirstDegAff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0629664172225647"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0931782681832711"
fitMedianPolished <- rlm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked, data = summaryTableFirstDegUnaff)
print(summary(fitMedianPolished))
##
## Call: rlm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked,
## data = summaryTableFirstDegUnaff)
## Residuals:
## Min 1Q Median 3Q Max
## -7.211e-04 -4.547e-05 5.476e-06 3.295e-05 9.926e-01
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -0.0001 0.0000 -36.5957
## carrierRiskUnaffectedInfoMasked 0.0011 0.0000 144.7512
##
## Residual standard error: 5.707e-05 on 5885 degrees of freedom
ggplot(data = summaryTableFirstDegUnaff, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
geom_point(aes(color = probandMLH1Status)) +
geom_line(aes(y = predict(fitMedianPolished)))+
labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "MLH1 Carrier Risk for Masked Info and Full Family Information with Median Polished Regression", subtitle="First Degree Family Members Unaffected")
# get predicted values for training data
summaryTableFirstDegUnaff$predicted <- predict(fitMedianPolished)
# calculate average predicted value
avg_adj <- mean(summaryTableFirstDegUnaff$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTableFirstDegUnaff$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 9.21074572200889e-05"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.137705259344844"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0016986580601325"
mse_adj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTableFirstDegUnaff$probandMLH1Status) - summaryTableFirstDegUnaff$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.00169588917621271"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0380058380211608"
summaryTable$lowScoreIndicator <- ifelse(summaryTable$fullCarrierRisk < 0.1, 1, 0)
summaryTable$groupLowScore <- ifelse(summaryTable$lowScoreIndicator == 1, "0",
ifelse(summaryTable$lowScoreIndicator == 1, "1","NA"))
fit <- lm(fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked + lowScoreIndicator, data= summaryTable)
print(summary(fit))
##
## Call:
## lm(formula = fullCarrierRisk ~ carrierRiskUnaffectedInfoMasked +
## lowScoreIndicator, data = summaryTable)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62449 -0.01339 0.00924 0.02301 0.25211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.514848 0.005499 93.62 <2e-16 ***
## carrierRiskUnaffectedInfoMasked 0.233011 0.005915 39.39 <2e-16 ***
## lowScoreIndicator -0.548084 0.004811 -113.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08787 on 9643 degrees of freedom
## Multiple R-squared: 0.8314, Adjusted R-squared: 0.8314
## F-statistic: 2.378e+04 on 2 and 9643 DF, p-value: < 2.2e-16
ggplot(data = summaryTable, aes(x= carrierRiskUnaffectedInfoMasked, y= fullCarrierRisk)) +
geom_point() +
geom_line(aes(y = predict(fit)))+
labs(x = "Carrier Risk Unaffected Info Masked", y= "Carrier Risk for Full Family Information", title = "MLH1 Carrier Risk for Masked Info and Full Family Information", subtitle="First Degree Family Members Unaffected")
# get predicted values for training data
summaryTable$predicted <- predict(fit)
# calculate average predicted value
avg_adj <- mean(summaryTable$predicted) #adjusted means adjusted via prediction model
avg_unadj <- mean(summaryTable$carrierRiskUnaffectedInfoMasked)
avg_true <- mean(as.numeric(summaryTable$probandMLH1Status))
# print average predicted value
print(str_interp("Average predicted (adjusted) carrier risk score: ${avg_adj}"))
## [1] "Average predicted (adjusted) carrier risk score: 0.0637092689623149"
print(str_interp("Average masked (unadjusted) carrier risk score: ${avg_unadj}"))
## [1] "Average masked (unadjusted) carrier risk score: 0.216830850677365"
print(str_interp("Average number of true carriers of MLH1: ${avg_true}"))
## [1] "Average number of true carriers of MLH1: 0.0656230561890939"
mse_adj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$predicted)**2)
mse_unadj <- mean((as.numeric(summaryTable$probandMLH1Status) - summaryTable$carrierRiskUnaffectedInfoMasked)**2)
print(str_interp("Adjusted MSE: ${mse_adj}"))
## [1] "Adjusted MSE: 0.0222027186254911"
print(str_interp("Unadjusted MSE: ${mse_unadj}"))
## [1] "Unadjusted MSE: 0.0595062698042183"